
###   analytical code for taboo topics paper
library(ggplot2)
library(ggrepel)
library(dplyr)
library(sqldf)
library(survival)
library(lme4)
library(lubridate)
library(xtable)
library(texreg)
library(weights)
library(survey)
library(xtable)
options(bitmapType='cairo') #run on cluster without x11

### set these globals
basePath = "/gscratch/comdata/users/kaylea/taboo/" ##change if running in new environment
dataPath = paste0(basePath, 'processed_data/')
rawDataPath = paste0(basePath, 'raw_data/')
knitrPath = paste0(basePath, 'knitr_rdata/')
figPath = paste0(basePath, 'figures/')

### should run consistently from here with no changes to vars

### Pick up helpers
source(paste0(basePath, 'libs/lib-00-utils.R'))

### load up datasets and attach in turn.
load(paste0(dataPath, 'dataset1.RData'))
load(paste0(dataPath, 'dataset2.RData'))

ls()
artDF.prot$pct.prot[is.na(artDF.prot$pct.prot)] <- 0
artDF.prot$everProtected <- case_when(artDF.prot$pct.prot == 0 ~ FALSE, TRUE ~ TRUE)

artDF$pct.prot <- artDF.prot$pct.prot


#examine some extreme values
#head(vqDF[order(vqDF$prediction),])
#tail(vqDF[order(vqDF$prediction),])
head(qDF[order(qDF$weighted_sum),])
tail(qDF[order(qDF$weighted_sum),])

numArtMonthYearQ <- length(qDF$yearMonth)
numArtMonthYearQ
numArtMonthYearV <- length(vDF$yearMonth)
numArtMonthYearV

#sumVQDF <- summary(vqDF$prediction)
#sumVQDF # summary fields are 1 - min, 2 - 25th 3 - mean? 4 - median? 5 - 75th? 6: max
qBreakdown.ctab <- (prop.table(table(qDF.CTab$ORES_quality_prediction, qDF.CTab$source)))
qBreakdown.ctab
qBreakdown.ngram <- (prop.table(table(qDF.NGram$ORES_quality_prediction, qDF.NGram$source)))
qBreakdown.ngram

uniqueInSample.NGram <- data.frame(title=unique(vDF.NGram$encodedTitle), source='ngram')
uniqueInSample.CTab <- data.frame(title=unique(vDF.CTab$encodedTitle), source='taboo')
uniqueInSample <- rbind(uniqueInSample.NGram, uniqueInSample.CTab)

uniqTab <- table(uniqueInSample$source) 
uniqTab
numArts <- (uniqTab[1] + uniqTab[2])

## 


##Reverts
revDF.clean$got_reverted <- as.numeric(revDF.clean$got_reverted)
artDF$revert_rate <- artDF$got_reverted/artDF$revid
artDF.ngram <- subset(artDF, artDF$source=='ngram')
artDF.taboo <- subset(artDF, artDF$source=='taboo')
medianTabRevertRate <- median(artDF.taboo$revert_rate)
medianNGramRevertRate <- median(artDF.ngram$revert_rate)
meanTabRevertRate <- mean(artDF.taboo$revert_rate)
meanNGramRevertRate <- mean(artDF.ngram$revert_rate)
utest.meanRevRate <- wilcox.test(revert_rate~source, artDF) ### from here
meanRevRate.pv <- scales::pvalue(utest.meanRevRate$p.value)
meanRevRate.ut <- utest.meanRevRate$statistic


##Damaging 

strippedDF <- data.frame("revid"=revDF.clean$revid, "encodedTitle"=revDF.clean$encodedTitle)

dmgDF <- merge(dmgDF, strippedDF, by="revid", all.x=TRUE)#lost our encoded titles along the way, adding them back
dmgDF$is_damaging <- case_when(dmgDF$prediction == 'True' ~ 1, TRUE ~ 0)


numEdits.d <- dmgDF %>% group_by(encodedTitle) %>% dplyr::summarize(numEdits=length(revid)) ##articlewise revisions count
n.revs.d <- length(dmgDF$revid) ## total number of revisions
n.arts.d <- length(numEdits$encodedTitle) ## total number of articles
dmgDF.clean <- merge(dmgDF, numEdits, by="encodedTitle")
dmgDF.clean <- merge(dmgDF.clean, numEditors, by="encodedTitle")
dmgDF <- dmgDF.clean


artDF.dmg <- dmgDF %>% dplyr::group_by(encodedTitle, source) %>% dplyr::summarize(
        across(revid, length),
        across(is_damaging, sum)
)

artDF.dmg$pct_dmg <- artDF.dmg$is_damaging / artDF.dmg$revid
artDF.dmg.taboo <- subset(artDF.dmg, artDF.dmg$source=='taboo')
artDF.dmg.ngram <- subset(artDF.dmg, artDF.dmg$source=='ngram')
medianTabDmgRate <- median(artDF.dmg.taboo$pct_dmg)
medianNGramDmgRate <- median(artDF.dmg.ngram$pct_dmg)
meanTabDmgRate <- mean(artDF.dmg.taboo$pct_dmg)
meanNGramDmgRate <- mean(artDF.dmg.ngram$pct_dmg)
utest.meanDmgRate <- wilcox.test(pct_dmg ~source, artDF.dmg)
meanDmgRate.pv <- scales::pvalue(utest.meanDmgRate$p.value)
meanDmgRate.ut <- utest.meanDmgRate$statistic

##Quality
avgQualityDF <- qDF %>% dplyr::group_by(encodedTitle, source) %>% dplyr::summarize(
        across(weighted_sum, mean)
)

artDF <- merge(artDF, avgQualityDF, by='encodedTitle')
artDF$avg_quality <- artDF$weighted_sum
artDF$weighted_sum <- NULL # just to make sure there's no confusion

finalQualityDF <- subset(qDF, qDF$timestamp=='20210901000000')
finalQualityDF$page_id <- NULL
finalQualityDF$timestamp <- NULL
finalQualityDF$ORES_quality_prediction <- NULL
finalQualityDF$yearMonth <- NULL
finalQualityDF$qualityRankInMyMonth <- NULL
finalQualityDF$source <- NULL
finalQualityDF$date <- NULL
finalQualityDF$birthday <- NULL
finalQualityDF$monthsOld <- NULL
finalQualityDF$yearsOld <- NULL
finalQualityDF$numQualityReadings <- NULL
finalQualityDF$weight <- NULL

artDF <- merge(artDF, finalQualityDF, by='encodedTitle')
artDF$final_quality <- artDF$weighted_sum
artDF$weighted_sum <- NULL # just to make sure there's no confusion
artDF$source <- artDF$source.x

##new fields, so make new subset
artDF.taboo <- subset(artDF, artDF$source=='taboo')
artDF.ngram <- subset(artDF, artDF$source=='ngram')

medianTabQuality <- median(artDF.taboo$avg_quality)
medianNGramQuality <- median(artDF.ngram$avg_quality)
meanTabQuality <- mean(artDF.taboo$avg_quality)
meanNGramQuality <- mean(artDF.ngram$avg_quality)
utest.meanQuality <- wilcox.test(avg_quality~source, artDF)
meanQuality.pv <- scales::pvalue(utest.meanQuality$p.value)
meanQuality.ut <- utest.meanQuality$statistic



### category reporting

propSex <- prop.table(table(artCatDF$isSex))
propSex
propRel <- prop.table(table(artCatDF$isRel))
propRel
propPol <- prop.table(table(artCatDF$isPol))
propPol


##R. User identifiability descriptive statistics reporting.

editorsDF <- revDF.clean %>% group_by(editor_id_or_ip) %>% dplyr::summarize(entriesFromThisEditor = n(), anon=anon, loggedIn=loggedIn) # add cols we want to keep to end
editorsDF <- unique(editorsDF)
head(editorsDF)
tail(editorsDF)
### checkpoint:
### note that if there are edits attributed to editor_id_or_ip of "" that's a sign of missing data, probably due to redirect targets not being followed



artDF$everProtected <- case_when(artDF$pct.prot == 0 ~ FALSE, TRUE ~ TRUE)
t <- data.frame(encodedTitle=artDF$encodedTitle, pct.prot=artDF$pct.prot, everProtected=artDF$everProtected)
t <- unique(t)
revDF.clean <- merge(revDF.clean, t, by='encodedTitle', all.x=TRUE)


artRevSumDF <- revDF.clean %>% group_by(encodedTitle) %>% dplyr::summarize( ##not unique editors
	numEdits = n(), 
numAnonEdits=sum(as.numeric(as.logical(anon))),
avgEditorEditCount = mean(editor_nth_edit_nocollapse),
	medianEditorEditCount = median(editor_nth_edit_nocollapse)
	)

## to get anon count

artRevSumDF$ident.prop=(artRevSumDF$numEdits-artRevSumDF$numAnonEdits)/artRevSumDF$numEdits
artRevSumDF$anon.prop=artRevSumDF$numAnonEdits/artRevSumDF$numEdits
##merge this in

artDF <- merge(artDF, artRevSumDF, by='encodedTitle')

#m.who.anon <- glm(anon.prop ~ source + pct.prot, family=binomial(link='logit'), data=artDF)
#m.who.anon.noprot <- glm(anon.prop ~ source, family=binomial(link='logit'), data=artDF)
m.who.anon <- glmer(as.logical(anon) ~ source + pct.prot + (1|encodedTitle), family=binomial(link='logit'), data=revDF.clean)
summary(m.who.anon)

m.who.ident <- glm(ident.prop ~ source + pct.prot, family=binomial(link='logit'), data=artDF)
summary(m.who.ident)

m.who.exp <- lm(log(avgEditorEditCount) ~ source + pct.prot, data=artDF)
summary(m.who.exp)

medianTabExp <- median(subset(artDF, artDF$source=='taboo')$avgEditorEditCount)
medianNGramExp <- median(subset(artDF, artDF$source=='ngram')$avgEditorEditCount)
meanTabExp <- mean(subset(artDF, artDF$source=='taboo')$avgEditorEditCount)
meanNGramExp <- mean(subset(artDF, artDF$source=='ngram')$avgEditorEditCount)
utest.meanExp <- wilcox.test(avgEditorEditCount~source, artDF)
meanExp.pv <- scales::pvalue(utest.meanExp$p.value)
meanExp.ut <- utest.meanExp$statistic

revDF.clean.acct <- subset(revDF.clean, revDF.clean$anon == "false")
revDF.clean.acct$hasUserpage <- case_when(revDF.clean.acct$userpage_text_chars == 0 ~ FALSE, TRUE ~ TRUE)

revDF.clean.acct <- unique(revDF.clean.acct)


n.acctholders <- length(unique(revDF.clean.acct$editor_id_or_ip))
n.nonzero <- length(unique(subset(revDF.clean.acct, revDF.clean.acct$hasUserpage != 0)$editor_id_or_ip))


con <- textConnection("m.who.anon.texreg", "w") #how we save R objects to strings
sink(con, split=TRUE, type="output")

texreg(m.who.anon, stars=NULL,
       digits=4,
       omit.coef = 'factor',
       custom.coef.names = c(NA, 'Taboo', 'Protection Level'), #NA means leave as is in default
       custom.model.names = c('Contributing Without an Account'),
       use.packages=FALSE,
       table=FALSE,
       ci.force = TRUE)

sink()
close(con);rm(con)


## note that this is logged
con <- textConnection("m.who.exp.texreg", "w") #how we save R objects to strings
sink(con, split=TRUE, type="output")

texreg(m.who.exp, stars=NULL,
       digits=4,
       omit.coef = 'factor',
       custom.coef.names = c(NA, 'Taboo', 'Protection Level'), #NA means leave as is in default
       custom.model.names = c('Contribution Count Model'),
       use.packages=FALSE,
       table=FALSE,
       ci.force = TRUE)

sink()
close(con);rm(con)



#R. Growth over time
growthAgeM <- lm(weighted_sum ~ source*monthsOld, data=qDF)
summary(growthAgeM)

growthAgeM

con <- textConnection("growthAge.texreg", "w") #how we save R objects to strings
sink(con, split=TRUE, type="output")

texreg(growthAgeM, stars=NULL,
       digits=4,
       #omit.coef = 'factor',
       use.packages=FALSE,
	custom.model.names = ('Quality by Taboo and Article Age'),
       custom.coef.names = c(NA, 'Taboo', 'Months Old', 'Taboo : Months Old'), #NA means leave as is in default
       table=FALSE,
       ci.force = TRUE)

sink()
close(con);rm(con)

viewRankM <- lm(log(viewRankInMyMonth) ~ source, data=vDF)
summary(viewRankM)
viewRankM

con <- textConnection("viewRankTime.texreg", "w") #how we save R objects to strings
sink(con, split=TRUE, type="output")
texreg(viewRankM, 
	stars=NULL,
       	digits=4,
       	omit.coef = 'yearMonth',
       	use.packages=FALSE,
        custom.model.names = ('Log-Rank of Article Views by Source'),
	custom.coef.names = c(NA, 'Taboo'),
       	table=FALSE,
       	ci.force = TRUE)

sink()
close(con);rm(con)




revCount = length(revDF.clean$revid)
revCount.ngram = length(which(revDF.clean$source == 'ngram'))
revCount.ctab = length(which(revDF.clean$source == 'taboo'))
contribCount = length(unique(revDF.clean$editor)) #editor name or IP


## gender


table(revDF.clean$gender)
userDF <- data.frame('editor'=revDF.clean$editor, 'gender' = revDF.clean$gender, 'emailable' = revDF.clean$emailable, 'anon'=revDF.clean$anon, 'source'=revDF.clean$source, 'userpage_text_chars'=revDF.clean$userpage_text_chars)
userDF <- subset(userDF, userDF$anon == 'false')
userDF <- unique(userDF)
dim(userDF)
userDFOLD <- userDF #just in case

##summariZe got overloaded
upDF <- userDF %>% group_by(editor) %>% summarise(hasUP = case_when(any(userpage_text_chars > 0) ~ TRUE, TRUE ~ FALSE))
upDF <- unique(upDF)
dim(upDF)
dim(userDF)
userDF <- merge(userDF, upDF, by='editor', all.x=TRUE)
dim(userDF)
userDF <- unique(userDF)
dim(userDF)
userDF$userpage_text_chars <- NULL #don't need anymore
dim(userDF)
userDF <- unique(userDF)
dim(userDF)

collapseDF <- userDF %>% group_by(editor) %>% summarise(ever_edited_taboo=any(source =='taboo')) %>% ungroup()
dim(collapseDF)

userDF <- merge(userDF, collapseDF, by='editor', all.x=TRUE)
dim(userDF)
userDF$source <- NULL
userDF <- unique(userDF)
dim(userDF)

table(userDF$ever_edited_taboo)

##H5C here

hasUPTab <- prop.table(table(userDF$hasUP, userDF$ever_edited_taboo), margin=1)
prop.table(table(userDF$hasUP, userDF$ever_edited_taboo), margin=2)
up.chi <- chisq.test(userDF$ever_edited_taboo, userDF$hasUP)
up.ts <- up.chi$statistic
up.pv <- scales::pvalue(up.chi$p.value)

#H5D here

table(userDF$gender)
prop.table(table(userDF$gender)) ### FIX: this is a manual copy atm
userDF$gaveGender <- case_when(userDF$gender=='unknown' ~ FALSE, TRUE ~ TRUE)
genderTab <- prop.table(table(userDF$ever_edited_taboo, userDF$gender), margin = 1)
gaveGender.chi <- chisq.test(userDF$ever_edited_taboo, userDF$gaveGender)
gaveGender.ts <- gaveGender.chi$statistic
gaveGender.pv <- scales::pvalue(gaveGender.chi$p.value)

gaveGenderDF <- subset(userDF, userDF$gender != 'unknown')
gaveGenderDF <- droplevels(gaveGenderDF)
gaveGenderTab <- prop.table(table(gaveGenderDF$ever_edited_taboo, gaveGenderDF$gender), margin = 1)
areFemale.chi <- chisq.test(gaveGenderDF$ever_edited_taboo, gaveGenderDF$gender)
areFemale.ts <- areFemale.chi$statistic
areFemale.pv <- scales::pvalue(areFemale.chi$p.value)


#H5E
#emailable 
table(userDF$emailable)
emailableTab <- prop.table(table(userDF$ever_edited_taboo, userDF$emailable), margin = 1)
emailable.chi <- chisq.test(userDF$ever_edited_taboo, userDF$emailable)
emailable.ts <- emailable.chi$statistic
emailable.pv <- scales::pvalue(emailable.chi$p.value)

mailGender.cor <- cor.test(as.numeric(userDF$emailable), as.numeric(userDF$gaveGender))
mailGender.cor.est <- mailGender.cor$estimate
mailGender.pv <- scales::pvalue(mailGender.cor$p.value) ##not enough obs




genderTab.ft <- ftable(genderTab)
genderTab.ft.xt <- xtableFtable(genderTab.ft, method = "compact", digits = 0)
print.xtableFtable(genderTab.ft.xt, rotate.colnames = TRUE)
gaveGenderTab.ft <- ftable(gaveGenderTab)
gaveGenderTab.ft.xt <- xtableFtable(gaveGenderTab.ft, method = "compact", digits = 0)
print.xtableFtable(gaveGenderTab.ft.xt, rotate.colnames = TRUE)
emailableTab.ft <- ftable(emailableTab)
emailableTab.ft.xt <- xtableFtable(emailableTab.ft, method = "compact", digits = 0)
print.xtableFtable(emailableTab.ft.xt, rotate.colnames = TRUE)

#anon

revDF.clean.taboo <- subset(revDF.clean, revDF.clean$source=='taboo')
revDF.clean.ngram <- subset(revDF.clean, revDF.clean$source=='ngram')
anonProp.ctab <- prop.table(table(revDF.clean.taboo$anon))
anonProp.ngram <- prop.table(table(revDF.clean.ngram$anon))
anonTab <- data.frame(rbind(anonProp.ngram, anonProp.ctab))
rownames(anonTab) <- c('Random', 'Taboo')
colnames(anonTab) <- c('With Account','Without Account') #dropped now


artDF$pct_revert = artDF$got_reverted/artDF$revid
revrev.cor <- cor.test(artDF$pct_revert, artDF$revid)
revrev.cor.est <- revrev.cor$estimate
revrev.pv <- scales::pvalue(revrev.cor$p.value)

meanNGramContrib <- mean(subset(artDF, artDF$source=='ngram')$revid) 
meanTabContrib <- mean(subset(artDF, artDF$source=='taboo')$revid) 
medianNGramContrib <- median(subset(artDF, artDF$source=='ngram')$revid) 
medianTabContrib <- median(subset(artDF, artDF$source=='taboo')$revid) 
utest.contrib <- wilcox.test(revid~source, artDF)
contrib.pv <- scales::pvalue(utest.contrib$p.value)
contrib.ut <- utest.contrib$statistic

## H1 XXXX

byArtViews <- vDF %>% dplyr::group_by(encodedTitle) %>% dplyr::summarize(
	across(viewRankInMyMonth, mean)
	)

artDF <- merge(artDF, byArtViews, by='encodedTitle')
artDF$avgViewRank <- artDF$viewRankInMyMonth
artDF$viewRankInMyMonth <- NULL

medianTabViewRank <- median(subset(artDF, artDF$source=='taboo')$avgViewRank)
medianNGramViewRank <- median(subset(artDF, artDF$source=='ngram')$avgViewRank)
meanTabViewRank <- mean(subset(artDF, artDF$source=='taboo')$avgViewRank)
meanNGramViewRank <- mean(subset(artDF, artDF$source=='ngram')$avgViewRank)
utest.meanRank <- wilcox.test(avgViewRank~source, artDF)
meanRank.pv <- scales::pvalue(utest.meanRank$p.value)
meanRank.ut <- utest.meanRank$statistic

damageM.vol <- lm(got_reverted ~ source + revid, data=artDF) #control for contrib count
summary(damageM.vol)

damageM.vol

con <- textConnection("damageM.vol.texreg", "w") #how we save R objects to strings
sink(con, split=TRUE, type="output")

texreg(damageM.vol, stars=NULL,
       digits=4,
       #omit.coef = 'factor',
       use.packages=FALSE,
	custom.model.names = ('Revert Count'),
       custom.coef.names = c(NA, 'Taboo', 'Contribution Count'),
       table=FALSE,
       ci.force = TRUE)

sink()
close(con);rm(con)


## additional analysis of contributors

revDF.xtab <- xtabs(~source+anon+got_reverted, data=revDF.clean) 
revDF.xtab.fmt <- xtable(format(prop.table(ftable(revDF.xtab), margin=1)))


save(vDF, qDF, userDF, revDF.clean, atBirthDF, file=paste0(dataPath, "EDA.RData"), version=2)
save(artDF, artDF.prot, artDF.dmg, file=paste0(dataPath, "artDF.RData"), version=2)
 

if (!nosave) { ##produce file to be loaded up in .RTex. 50 mb limit
  r <- list()
remember(revDF.xtab.fmt)
remember(damageM.vol.texreg)
remember(revrev.cor.est) 
remember(revrev.pv) 
remember(genderTab)
remember(emailableTab)
remember(hasUPTab)
remember(up.ts)
remember(up.pv)
remember(gaveGenderTab)
remember(gaveGender.ts)
remember(gaveGender.pv)
remember(areFemale.ts)
remember(areFemale.pv)
remember(emailable.ts)
remember(emailable.pv)
	remember(isABot.tab)
  remember(numCats)
  remember(sex.prob)
  remember(sex.pv)
  remember(sex.est)
  remember(sex.z)
  remember(numArts) 
  remember(qBreakdown.ngram)
  remember(qBreakdown.ctab)
  remember(uniqTab)
  remember(anonTab)
  remember(m.who.exp)
  remember(m.who.anon.texreg)
  remember(m.who.exp.texreg)
  remember(growthAge.texreg)
  remember(viewRankTime.texreg)
  remember(revCount)
  remember(revCount.ngram)
  remember(revCount.ctab)
  remember(contribCount)
  remember(numArtMonthYearQ)
  remember(numArtMonthYearV)
	remember(medianTabViewRank)
	remember(medianNGramViewRank)
	remember(meanTabViewRank)
	remember(meanNGramViewRank)
	remember(utest.meanRank)
	remember(meanRank.pv) 
	remember(meanRank.ut) 
	remember(medianTabRevertRate)
	remember(medianNGramRevertRate)
	remember(meanTabRevertRate)
	remember(meanNGramRevertRate)
	remember(utest.meanRevRate)
	remember(meanRevRate.pv) 
	remember(meanRevRate.ut) 
	remember(medianTabDmgRate)
	remember(medianNGramDmgRate)
	remember(meanTabDmgRate)
	remember(meanNGramDmgRate)
	remember(utest.meanDmgRate)
	remember(meanDmgRate.pv) 
	remember(meanDmgRate.ut) 
remember(medianTabExp) 
remember(medianNGramExp)
remember(meanTabExp) 
remember(meanNGramExp)
remember(utest.meanExp) 
remember(meanExp.pv) 
remember(meanExp.ut) 
remember(n.acctholders) 
remember(n.nonzero) 
remember(medianNGramContrib) 
remember(medianTabContrib) 
remember(meanNGramContrib) 
remember(meanTabContrib) 
remember(utest.contrib) 
remember(contrib.pv)
remember(contrib.ut) 
	remember(medianTabQuality)
	remember(medianNGramQuality)
	remember(meanTabQuality)
	remember(meanNGramQuality)
	remember(utest.meanQuality)
	remember(meanQuality.pv) 
	remember(meanQuality.ut) 

  save(r, file=paste0(knitrPath, "knitr_data.RData"), version=2) 
  print("Remembrances complete.")
  rm(r)
}  

save.image(paste0(rawDataPath, 'postStandaloneEnvironment.RData'))

